Housing data

In this first chunk, let’s read in the Census microdata. Here is some example code on how to read in the data, create new variables to categorize the rows of data into groups, and then summarize the data to create information about Louisville.

Our goal is to create variables for gender, age group, whether someone is a mother, whether someone is married, their level of education, their income, whether they are the head of household, and the number of children they have.

This code chunk will identify which households are homeowners vs. renters (in the homeownership variable) and which households are cost-burdened, meaning they pay more thatn 30% of their income toward rent or a mortgage (in the cost_burden variable).

There are also variables for severe cost burden (households that pay more than half of their income towards housing) and households with severe housing problems (lacking a kitchen, adequate plumbing, or an ample number of rooms for the number of people living there).

load("clean_svybydemog_data.RData")

#Waffle Chart

Homeownership

Single female homeownership

Ranking

# H_singFem_rank <- census_microdata081122 %>%
#   filter(
#     year == '2019',
#     earner_type == 'single_earner') %>%
#   survey_by_demog('homeownership', weight_var = "HHWT") %>%
#   filter(
#     sex == 'total',
#     race == 'total',
#     var_type == 'percent')

temp_df <- H_earntype %>% 
  filter(male_fem_mult_earn == "single_fem_earner",
         var_type == "percent", sex == "total") %>%
  mutate(sex = "total")

ranking(temp_df, 
        'homeownership',
        plot_title = "Single Earner Female Homeownership",
        caption_text = 
      "Source: Greater Louisville Project
       Data from GLP analysis of ACS microdata from IPUMS-USA")

by presence of kids

# H_s_Femkids_trend <- census_microdata081122 %>%
#                     filter(earner_type == 'single_earner') %>%
#   survey_by_demog( weight_var = "HHWT", 'homeownership', other_grouping_vars = c("kd_pres"))
  
H_s_Femkids_trend %<>%
  filter(
    var_type == 'percent',
    race == 'total',
    sex == "female") %>%
  pivot_wider(names_from = 'kd_pres', values_from = 'homeownership') %>%
  select(-sex)



trend(H_s_Femkids_trend, 
      kids:no_kids,
      rollmean = 3,
      plot_title = "Female Homeownership by Presence of Children", 
      cat = c("Children" = "kids", "No Children" = "no_kids"), 
      y_title = 'Percent',
      caption_text = 
      "Source: Greater Louisville Project
       Data from GLP analysis of ACS microdata from IPUMS-USA")

Ranking with kids

# H_sinFem_kids <-census_microdata081122 %>%
#   filter(
#     year == '2019',
#     earner_type == 'single_earner',
#     NCHILD > 0) %>%
#   survey_by_demog('homeownership', weight_var = "HHWT") %>%
#   filter(
#     sex == 'total',
#     race == 'total',
#     var_type == 'percent')

ranking(H_sinFem_kids, 'homeownership',
        plot_title = "Single Earner Female Homeownership with Children",
        title_scale = 0.8,
        caption_text = 
      "Source: Greater Louisville Project
       Data from GLP analysis of ACS microdata from IPUMS-USA")

#H_Fem_race <- survey_by_demog(census_microdata081122 ,weight_var = "HHWT", 'homeownership')

# H_Fem_race <- census_microdata081122 %>%
#   filter(sex == 'female') %>%
#   survey_by_demog( weight_var = "HHWT", 'homeownership')

temp_df <- H_earntype %>%
  filter(male_fem_mult_earn == "single_fem_earner",
         var_type == "percent", sex == "total")

trend(filter(temp_df, race != "hispanic"), 
      homeownership, 
      rollmean = 3,
      pctiles = F,
      plot_title = "Single Female Homeownership by Year", 
      cat = 'race', 
      y_title = 'Percent',
      caption_text = 
      "Source Greater Louisville Project
       Data from GLP analysis of ACS microdata from IPUMS-USA")

Income

Income Percentiles

#################KEEP################

## create hist for each of the most three recent years -> just for verification...that there isnt anything weird
## if feeling funky, use gganimate to create a gif of histograms from 2000 to 2019

#TO DO:
# Change the x-axis to a dollar format (see trendline function in trendline_helpers.R) - done
# Label the x-axis on every $50,000? - done
# Let's use counts on the y-axis for the male and female graph - done?
# Format the y-axis with commas - done
# remove legend? - done

#p$HHINCOME %>% dollar(accuracy = 0.1, scale = .001, suffix = "k") 

single_earner_pctiles <- lville_2019 %>%
  group_by(sex) %>%
  summarize(
    ten_pct = Hmisc::wtd.quantile(HHINCOME, HHWT, probs = 0.1),
    twenty_five_pct = Hmisc::wtd.quantile(HHINCOME, HHWT, probs = 0.25),
    fifty_pct = Hmisc::wtd.quantile(HHINCOME, HHWT, probs = 0.5),
    seventy_five_pct = Hmisc::wtd.quantile(HHINCOME, HHWT, probs = 0.75),
    ninety_pct = Hmisc::wtd.quantile(HHINCOME, HHWT, probs = 0.9))

library(gt) 

gt(single_earner_pctiles)
sex ten_pct twenty_five_pct fifty_pct seventy_five_pct ninety_pct
female 11200 25000 50000 91000 152000
male 18000 37000 66900 108400 170000

Income by Earner Gender

p <- lville_2019 %>% 
  filter(HHINCOME <= cut_95,
         earner_type == "single_earner") %>%
  func_plt_hist_overlay( "sex")

p <- p + glp_graph_theme

p <- p + labs(
  title = "Single Earner Income by Gender",
) +
  
  ylab(" ") +
  
  guides(color = FALSE) + 
  
  facet_wrap(~sex, nrow = 2) +
  
  theme(
  #axis.ticks.x =  element_line(size = 50000),
  strip.text = element_blank()

  )  +

scale_x_continuous(
  breaks = c(50000, 100000, 150000, 200000),
  label = c("$50k", "$100k", "$150k", "$200k")
) +
  scale_y_continuous(labels = scales::comma)

p

Single Female Income by Race

# Need help turning yaxis into percents


sing_fem_inc_race <- lville_2019 %>% 
  filter(
    sex == 'female',
    earner_type == 'single_earner',
    HHINCOME <= cut_95) %>%
  func_plt_hist_overlay( "race") 

sing_fem_inc_race <- sing_fem_inc_race + facet_wrap(~race, nrow = 2, scales = "free_y") 

sing_fem_inc_race <- sing_fem_inc_race + glp_graph_theme

sing_fem_inc_race <- sing_fem_inc_race + 
  labs(
  title = "Single Female Earner Income",
) + 
  ylab(" ") +
  
  guides(color = FALSE)

sing_fem_inc_race <- sing_fem_inc_race + 
  
  theme( 
  #axis.ticks.x =  element_line(size = 50000),
  strip.text = element_blank()
  
  )  + 
  
scale_x_continuous(
  breaks = c(50000, 100000, 150000),
  label = c("$50k", "$100k", "$150k")
) +
  scale_y_continuous(labels = scales::comma)


sing_fem_inc_race

Single Female Earners Cost Burden

# TO DO
# see above for x and y axis to-dos
# Drop the legend title (see )


cost_burden_sf <- lville_2019 %>% 
  filter(
    sex == 'female',
    earner_type == 'single_earner',
    HHINCOME <= cut_95) %>%
  mutate(
    cost_burden = factor(cost_burden, 
                         levels = rev(c(TRUE, FALSE)), 
                         labels = rev(c("Cost Burdened", "Non Cost Burdened")), 
                         ordered = TRUE))

cost_burden_sf_plot <- ggplot(cost_burden_sf, 
       aes(x = HHINCOME, fill = cost_burden, weights = HHWT), 
       alpha=0.5, 
       position = "stack", 
       binwidth = 10000) + 
geom_histogram()

cost_burden_sf_plot <- cost_burden_sf_plot + glp_graph_theme

cost_burden_sf_plot <- cost_burden_sf_plot + 
  labs(
  title = "Single Female Earner Cost Burden",
) + 
  ylab(" ") +
  xlab("Household Income") +
  
  guides(color = FALSE) +
  
  theme(
  #axis.ticks.x =  element_line(size = 50000),
  strip.text = element_blank()

  )  +

  scale_fill_manual(values = c("#0E4A99", "#F58021")) +
  
scale_x_continuous(
  breaks = c(50000, 100000, 150000, 200000),
  label = c("$50k", "$100k", "$150k", "$200k")
) +
  scale_y_continuous(labels = scales::comma)

cost_burden_sf_plot

#I_CB_earn_trend <- survey_by_demog(census_microdata081122, weight_var = "HHWT", 'cost_burden', other_grouping_vars = c('earner_type'), breakdowns = "sex")

# I_CB_earn_trend <- survey_by_demog(census_microdata081122, weight_var = "HHWT", 'cost_burden', other_grouping_vars = c('male_fem_mult_earn'))

I_CB_earn_trend %<>%
  filter(
    var_type == 'percent',
    race == 'total',
    sex == 'total') %>%
  select( -c(sex,race)) %>%
  pivot_wider(names_from = "male_fem_mult_earn", values_from = "cost_burden")
  

trend(I_CB_earn_trend, 
      multiple_earner:single_fem_earner:single_male_earner, 
      pctiles = F,
      plot_title = "Cost Burden by Earner Type",
      cat = c("Multiple Earners" = "multiple_earner", "Single Female Earner" = "single_fem_earner", "Single Male Earner" = "single_male_earner"),
      y_title = 'Percent',
      caption_text = 
      "Source Greater Louisville Project
       Data from GLP analysis of ACS microdata from IPUMS-USA")

#will need to see if x-axis label knit correctly on Harrison's 
#having issues switching to glp colors & renaming legend values

#chart_possible_colors <- c("#0E4A99", "#F58021", "#00A9B7", "#800055", "#356E39", "#CFB94C", "#7E9C80")
# chart_number_I_need = length(unique(data$group))
# chart_these_colors <- possible_colors[1:number_I_need]

I_median_earn_age <- lville_2019 %>%
  group_by(age_group, male_fem_mult_earn) %>%
  summarize(Med=median(HHINCOME)) 

I_median_earn_age_plot <- ggplot(I_median_earn_age, 
       aes(x=age_group, y=Med, fill = male_fem_mult_earn)) + 
  geom_bar(stat="identity", position='dodge') 

I_median_earn_age_plot <- I_median_earn_age_plot + glp_graph_theme

I_median_earn_age_plot <- I_median_earn_age_plot +
  labs(
  title = "Median Earnings by Earning Type",
) + 
  ylab("Household Income") +
  xlab("Age Group") +
  
  scale_y_continuous(labels = scales::dollar) +   
  scale_fill_manual(
    values = c("#0E4A99", "#F58021", "#00A9B7"), 
    labels = c("Multiple Earner", "Single Female Earner", "Single Male Earner"))

  
I_median_earn_age_plot

Education

#issues displaying the percentages of each group

E_singM_singF <- lville_2019 %>% 
  filter(earner_type == 'single_earner') %>%
  group_by(sex, educ) %>%
  summarize(n=sum(HHWT, na.rm = TRUE)) %>%
  mutate(
    total = sum(n),
    rate = n/sum(n)*100,
    educ = factor(educ, 
                  levels = rev(c("no_hs", "hs", "some_col", "assoc", "bach","grad")), 
                  ordered = TRUE))


E_singM_singF_plot <- ggplot(E_singM_singF, 
       aes(x=sex, y=rate, fill = educ)) + 
  geom_bar(stat="identity", position = "fill") 
  # geom_col(aes(fill = educ)) +
  # geom_text(aes(label = rate, position = position_stack(vjust = 0.5))
#)
  
E_singM_singF_plot <- E_singM_singF_plot + glp_graph_theme

E_singM_singF_plot <- E_singM_singF_plot + 
  
  theme(
    legend.position = "right"
    ) +

  labs(
  title = "Educational attainment by gender for single earners",
) + 
  ylab(" ") +
  xlab(" ") +
  scale_fill_discrete(labels = c("Graduate","Bachelor", "Associate", "Some College",  "High School", "No High School")) + 
  scale_x_discrete (labels = c("female" = "Female", "male" = "Male")) +
  
  scale_y_continuous(labels = scales::percent)

E_singM_singF_plot

#show percentages in bars?
#how does it look when harrison knits it?

E_singF_race <- lville_2019 %>% 
  filter(
    sex == 'female',
    earner_type == 'single_earner') %>%
  group_by(race, educ) %>%
  summarize(n=sum(HHWT, na.rm = TRUE)) %>%
  mutate(
    total = sum(n),
    rate = n/sum(n)*100,
    educ = factor(educ, 
                  levels = rev(c("no_hs", "hs", "some_col", "assoc", "bach","grad")), 
                  ordered = TRUE))

E_singF_race_plot <- ggplot(E_singF_race, aes(x=race, y=rate, fill=educ)) + 
geom_bar(stat="identity", position='fill')


E_singF_race_plot <- E_singF_race_plot + glp_graph_theme

E_singF_race_plot <- E_singF_race_plot + 
    theme(
    legend.position = "right"
    ) +
  labs(
  title = "Single Female Education Breakdown",
) + 
  ylab(" ") +
  xlab("Race") +
  scale_fill_discrete(labels = c("Graduate","Bachelor", "Associate", "Some College",  "High School", "No High School")) + 
  scale_x_discrete (labels = c("female" = "Female", "male" = "Male")) +
  
  scale_y_continuous(labels = scales::percent)

E_singF_race_plot

Household Age

#will xaxis look normal if knitted from harrison's computer?

# cost_burden_age_sf <- census_microdata081122 %>% 
#     filter(year %in% 2010:2019) %>%
#   mutate(
#     cost_burden = factor(cost_burden, 
#                          levels = rev(c(TRUE, FALSE)), 
#                          labels = rev(c("Cost Burdened", "Non Cost Burdened")), 
#                          ordered = TRUE)
#     )

cost_burden_age_sf %<>% drop_na(cost_burden) #this will need to be run once and then left alone if tweaking graphs

cost_burden_age_sf_plot <- ggplot(cost_burden_age_sf,
       aes(x=age_group, y=HHWT , fill=cost_burden),
        color="#00A9B7") +
geom_bar(stat="identity", position='fill')

cost_burden_age_sf_plot <- cost_burden_age_sf_plot + glp_graph_theme

cost_burden_age_sf_plot <- cost_burden_age_sf_plot + 
    theme(
    legend.position = "right"
    ) +
  labs(
  title = "Cost Burdened Status by Age",
) + 
  ylab(" ") +
  xlab("Race") +
  scale_fill_discrete(labels = c("Non Cost Burdened", "Cost Burdened")) + 
  #scale_x_discrete (labels = c("female" = "Female", "male" = "Male")) +
  
  scale_y_continuous(labels = scales::percent)

cost_burden_age_sf_plot

#x-axis...not legible on Josh's comp

temp_df <- cost_burden_age_sf %>%
  mutate(
    age_group = case_when(
      age %in% 15:19 ~ NA_character_, 
      age %in% 20:29 ~ "20-29", 
      age %in% 30:39 ~ "30-39",  
      age %in% 40:49 ~ "40-49",  
      age %in% 50:59 ~ "50-59",  
      age %in% 60:69 ~ "60-69", 
      age %in% 70:79 ~ "70-79", 
      age >= 80 ~ "80+"))

cost_burden_age_sf_facet_plt <- ggplot(temp_df,
       aes(x=age_group, y=HHWT , fill=cost_burden),
        color="#00A9B7") +
geom_bar(stat="identity", position='fill')+
facet_wrap(~male_fem_mult_earn)

cost_burden_age_sf_facet_plt <- cost_burden_age_sf_facet_plt + glp_graph_theme

cost_burden_age_sf_facet_plt <- cost_burden_age_sf_facet_plt + 
    theme(
    legend.position = "right",
    
    strip.text = element_text(size = 40)
    ) +
  labs(
  title = "Cost Burdened Status by Age and Earner Type",
) + 
  ylab(" ") +
  xlab(" ") +
  scale_fill_discrete(labels = c("Non Cost Burdened", "Cost Burdened")) + 
  scale_x_discrete(guide = guide_axis(n.dodge=2)) +

  scale_y_continuous(labels = scales::percent)

cost_burden_age_sf_facet_plt

Earner Types Over Time

#does this specficially need to be for louisville? 
#once data is decided, add this data frame to the saved data frame chunk at beginning of rmd file
#x-axis is crowded
#will knit look better from Harrison's

earner_trend <- census_microdata081122 %>%
  
  mutate(
    male_fem_mult_earn = case_when(
      sex == 'female' & earner_type == 'single_earner' ~ 'single_fem_earner',
      sex == 'male' & earner_type == 'single_earner' ~ 'single_male_earner',
      earner_type == 'multi_earner' ~ 'multiple_earner')
  ) %>% 
  group_by(year, male_fem_mult_earn) %>%
  summarize(n=sum(HHWT, na.rm = TRUE)) %>%
  mutate(
    total = sum(n),
    rate = n/sum(n)*100) 
  
earner_trend_plt <- ggplot(earner_trend, 
       aes(x=year, y=rate, fill=male_fem_mult_earn),
        color="#00A9B7") + 
geom_bar(stat="identity", position='fill')


earner_trend_plt <- earner_trend_plt + glp_graph_theme

earner_trend_plt <- earner_trend_plt + 
    theme(
    legend.position = "right"
    #strip.text = element_blank()
    ) +
  labs(
  title = "Earner Type Trend"
) + 
  ylab(" ") +
  xlab(" ") +
  scale_fill_discrete(labels = c("Multiple Earner", "Single Female Earner", "Single Male Earner")) + 

  scale_y_continuous(labels = scales::percent)

earner_trend_plt

Mortgage Data